###### Replication script of in-text claims
###### Do Indirect Taxes Bite? How Hiding Taxes Erases Accountability Demands from Citizens
gc(); rm(list = ls()); set.seed(12345)
setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) # Note: if you are not using R Studio this command will not work, set WD to source file location manually
require(dplyr)
require(tidyr)
require(stargazer)
require(plotrix)
require(gmodels)
require(ggplot2)
source("functions.R")

## Load all datasets
load("data/cleaned/vat_panel_cleaned15Nov.RData")
dfLab <- read.csv("data/cleaned/uganda_jun17_cleaned.csv",  
                  stringsAsFactors = F)
dfSurv <- read.csv("data/cleaned/uganda_jun18_cleaned.csv", 
                   stringsAsFactors = F)


######################### IN-TEXT CLAIMS ###########################
### "In our own survey data, only 7.5\% of Ugandans reported paying income tax"
# % of respondents paying income tax in Uganda
mean(dfSurv$tm_paid_income, na.rm = T)

### "direct taxes have increased 0.6 percentage points..."
# Change in direct taxes as % of GDP
load("results/fig_1.RData")
dfPlot$est[dfPlot$var == "direct" & dfPlot$year == 2017] - 
  dfPlot$est[dfPlot$var == "direct" & dfPlot$year == 1980]

### "...compared to 3 percentage points for indirect taxes"
# Change in indirect taxes as % of GDP
dfPlot$est[dfPlot$var == "indirect" & dfPlot$year == 2017] - 
  dfPlot$est[dfPlot$var == "indirect" & dfPlot$year == 1980]

### "This section tests this prediction using a panel 
###  dataset of 194 countries from 1980 to 2018"
# Total number of countries and time period
min(dfTax$year); max(dfTax$year)
length(unique(dfTax$ccode))


# Name of core control covariates
load("results/main_xnat_results.RData")

c(dfVar$label[dfVar$var_group == "key"], 
  "Total Revenue as % of GDP", 
  "Nontax Revenues as % of GDP")


# Number of aux variables
length(dfVar$var_name[dfVar$var_group == "additional"])

#  "A Kolmogorov-Smirnov test rejects the null that the two distributions in each panel are the same"
accDir <- mainRes$acc_vert$plot$data %>% 
  dplyr::filter(var_name == "Direct Taxes (% GDP)")

accIndir <- mainRes$acc_vert$plot$data %>%
  dplyr::filter(var_name == "Indirect Taxes (% GDP)")
ks.test(accDir$est, accIndir$est)

corrDir <- mainRes$v2x_corr$plot$data %>% 
  dplyr::filter(var_name == "Direct Taxes (% GDP)")

corrIndir <- mainRes$v2x_corr$plot$data %>%
  dplyr::filter(var_name == "Indirect Taxes (% GDP)")

ks.test(corrDir$est, corrIndir$est)


##  All models pass the modified Durbin-Watson test for AR(1) autocorrelation in unbalanced panels
# Modified Durbin-Watson Statistic
.rs.restartR()
gc()
rm(list = ls())
set.seed(12345)
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
source("functions.R")
source("data/working_eba_function_dwtest.R")
load("data/cleaned/vat_panel_cleaned15Nov.RData")
require(pacman)
pacman::p_load(plm, sandwich, clubSandwich, lmtest, 
               ggplot2, numbers, dplyr, Hmisc, formattable, 
               htmltools, webshot, ggpubr, panelView, fixest, plyr, 
               scales, DataCombine, plm)
numSims <- 4943
depUse <- c("acc_vert", "v2x_corr")
depNames <- c("Vertical Accountability", "Control of Corruption Index")
comPlm <- eba_fit_dw(dep.vars = depUse, 
                     dep.names = depNames,
                     samp.num = numSims,
                     data = dfMerge)
summary(comPlm$acc_vert$modelplm$arStat)
summary(comPlm$v2x_corr$modelplm$arStat)
save(list = c("comPlm"), 
     file = paste0("results/dwtest.RData"))


## % of each sign and significance for indirect and direct
##     taxation coefficients
gc()
rm(list = ls())
set.seed(12345)
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
source("functions.R")
load("results/main_xnat_results.RData")
mainRes$acc_vert$sig_avg[2:3,]
mainRes$acc_vert$sig_weighted[2:3,]


## Average coefficient values for direct/indirect
mainRes$acc_vert$coefs_avg[2:3,]
mainRes$acc_vert$coefs_weighted[2:3,]


## "We conducted 72 sessions of 16 respondents in Kampala, Uganda"
## Total number of sessions in lab game
gc()
rm(list = ls())
set.seed(12345)
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
source("functions.R")
load("data/cleaned/vat_panel_cleaned15Nov.RData")
dfLab <- read.csv("data/cleaned/uganda_jun17_cleaned.csv",  
                  stringsAsFactors = F)
dfSurv <- read.csv("data/cleaned/uganda_jun18_cleaned.csv", 
                   stringsAsFactors = F)
length(unique(dfLab$day_session))


## "debrief questions show 84.8\% of respondents believed the games were single-shot"
## % of people who passed leader match manipulation test
1 - mean(dfLab$same_leader[dfLab$round == 2] == 1, na.rm = T)

## "Post-treatment, 68.6% of respondents could correctly identify 
## the source of the group fund in the Visible VAT condition compared 
## to 29.8% in the Hidden VAT condition."
## "In the Direct tax condition 99% of subjects correctly identified the source of the group fund"
##  % passing manipulation check in each group
dfUse <- dfLab[dfLab$round == 2, ]
tapply(dfUse$mcheck_source, dfUse$treatment, mean, na.rm = T)

## "Respondents first earned 2,600 UGX through an effort task, then chose to 
## purchase either some soap or a cellphone airtime voucher for the good's
## actual local market price, typically 500-700 UG"
## Price of good purchased in survey experiment
prop.table(table(dfSurv$vm_purchase_price))

## Figure 3 output - Average ladder position for each treatment condition
tapply(dfSurv$vm_ladder, dfSurv$rand_vm, mean, na.rm = T)


############# APPENDIX CLAIMS

gc(); rm(list = ls()); set.seed(12345)
setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) # Note: if you are not using R Studio this command will not work, set WD to source file location manually
require(dplyr)
require(tidyr)
require(stargazer)
require(plotrix)
require(gmodels)
require(ggplot2)
source("functions.R")

## Load all datasets
load("data/cleaned/vat_panel_cleaned15Nov.RData")
dfLab <- read.csv("data/cleaned/uganda_jun17_cleaned.csv",  
                  stringsAsFactors = F)
dfSurv <- read.csv("data/cleaned/uganda_jun18_cleaned.csv", 
                   stringsAsFactors = F)
load("results/main_xnat_results.RData")

# 1. Name of additional covariates:
dfVar$label[dfVar$var_group == "additional"]

# 2. Min and max number of observations
range(mainRes$acc_vert$model$nobs)
range(mainRes$v2x_corr$model$nobs)

# Revenue missingness
dfPropmiss <- as.data.frame(propMiss)
taxvars <- dfVar$var_name[dfVar$source == "ICTD"]
dfPropmissICTD <- dfPropmiss %>% 
  select(all_of(taxvars))
range(dfPropmissICTD)

# 3. Mean missingness on additional covariates:
additvars <- dfVar$var_name[dfVar$var_group == "additional"]

dfPropmissaddit <- dfPropmiss %>% 
  select(all_of(additvars))
rowMeans(dfPropmissaddit)


# 4. Mean missingness on core covariates

keyvars <- dfVar$var_name[dfVar$var_group == "key"]
dfPropmisskey <- dfPropmiss %>% 
  select(all_of(keyvars), rev_ex_gr_ex_sc, nontax)
rowMeans(dfPropmisskey)

# Region counts
dfMerge %>%
  group_by(region) %>% 
  dplyr::summarise(volume = n()) %>%
  arrange(desc(volume))

